##########################################################################
# Description: Replicates appendix                                       #
# Author: Ozlem Tuncel                                                   #
# Title: Lecturer and Data Services Specialist                           #
# Affiliation: Georgia State University                                  #
# Department: Research Data Services & Political Science                 #
# Email: otuncelgurlek1@gsu.edu                                          #
# Date: 08/04/2025                                                       #
# R Version: R version 4.4.0 (2024-04-24 ucrt) -- "Puppy Cup"            #
# Computer: Dell Latitude 7450 x64-based PC                              #
# Processor: Intel(R) Core(TM) Ultra 7 165U, 2.10 GHz, 12 Cores          #
# BIOS version/date: Dell Inc. 1.3.0, 4/11/2024                          #
# OS: 22621.3880                                                         #
##########################################################################

# Clean slate
# NOTE(review): rm(list = ls()) removes the objects in the current environment
# only; it does not detach packages or reset options. Running the script in a
# fresh R session is the more reliable way to start clean.
rm(list = ls())
# Fix the random-number seed so that randomised steps are reproducible
set.seed(1234)
# Show the current working directory; the relative paths used in this script
# are resolved against it
getwd()
# setwd() # use this to set the working directory

# Load library ----
library(tidyverse)  # version 2.0.0
library(xtable)     # version 1.8-4
library(tidytext)   # version 0.4.2
library(tm)         # version 0.7-13
library(wordcloud)  # version 2.6
library(kableExtra) # version 1.4.0
# install.packages("remotes")
# remotes::install_github("davidsjoberg/ggsankey")
library(ggsankey)   # version 0.0.99999

# Import data ----
# Read the interview dataset into `my_data`; the sections below derive their
# inputs from this object. The path is relative to the working directory.
my_data <- read_csv("replication_data/elite_interview_data.csv")

## Appendix Figure 1 ----

# Distribution of articles within subfield, type of evidence, and author count
# Articles without a second author are labelled "Single-author"; any evidence
# type other than "qualitative" is labelled "Mixed-Methods". make_long()
# reshapes the three columns into the x / node / next_x / next_node layout
# that geom_sankey() expects.
sankey_data <- my_data |>
  select(article_title, type_of_evidence, subfield1, author2) |>
  mutate(author_count = ifelse(is.na(author2), "Single-author", "Multi-author"),
         type_of_evidence = ifelse(type_of_evidence == "qualitative",
                                   "Qualitative", "Mixed-Methods")) |>
  rename(`Type of Evidence` = type_of_evidence,
         `Subfield` = subfield1,
         `Author Count` = author_count) |>
  select(-author2) |>
  make_long(`Type of Evidence`, `Subfield`, `Author Count`)

# Calculate counts and percentages per node
# (only `n` is used in the figure labels below)
dagg <- sankey_data |>
  group_by(node) |>
  tally() |>
  mutate(percentage = n / sum(n) * 100)

# Attach the node counts to every row of the long data
final_sankey <- merge(sankey_data, dagg, by.x = 'node',
                      by.y = 'node', all.x = TRUE)

# Create the Sankey diagram
# The legend is switched off, so no guides() call is needed: each node is
# identified by its label instead.
sankey_figure <- ggplot(final_sankey,
                        aes(x = x,
                            next_x = next_x,
                            node = node,
                            next_node = next_node,
                            fill = factor(node),
                            label = paste0(node, "\n n = ", n))) +
  geom_sankey(flow.alpha = 0.5, node.color = "black") +
  geom_sankey_label() +
  theme_sankey(base_size = 16) +
  scale_fill_brewer(palette = "Set2") +
  theme(legend.position = "none",
        axis.title = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank())

sankey_figure

# Spell out `filename` instead of relying on partial matching of `file`
ggsave(filename = "figures/Appendix_Figure1.png", plot = sankey_figure,
       dpi = 300, width = 6, height = 4)

## Appendix Figure 2 ----
# Wordcloud of interviewee description

# Keep only the description column and discard rows without a description
text_df <- my_data |>
  select(elite_description) |>
  drop_na(elite_description)

# One row per word: tokenise, lower-case, then drop stop words, tokens
# containing digits or punctuation, and a few corpus-specific filler words
tidy_text <- text_df |>
  unnest_tokens(output = word, input = elite_description) |>
  mutate(word = str_to_lower(word)) |>
  filter(
    !word %in% stop_words$word,
    !str_detect(word, "\\d"),
    !str_detect(word, "[[:punct:]]"),
    !word %in% c("elites", "elite", "also", "and", "other", "including")
  )

# Word frequencies, most frequent first
word_freq <- count(tidy_text, word, sort = TRUE)

# Draw the word cloud; the seed is reset so the layout is reproducible
set.seed(1234)
wordcloud(
  word_freq$word,
  word_freq$n,
  min.freq = 2,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)

## Appendix Table 2 ----
# Ten most frequent words as a styled table
word_freq |>
  slice_head(n = 10) |>
  kable() |>
  kable_styling()

# Twenty most frequent words as a styled table
word_freq |>
  slice_head(n = 20) |>
  kable() |>
  kable_styling()


## Appendix Figure 3 ----
# Publishing on elite interviews over the years by journal

# Every journal-year combination from 2000 to 2023.
# stringsAsFactors = FALSE keeps `journal` as character: expand.grid()
# converts character vectors to factors by default, which would make the join
# key a factor on one side and character on the other.
full_years <- expand.grid(journal = unique(my_data$journal),
                          year = 2000:2023,
                          stringsAsFactors = FALSE)

# Count articles per journal-year, add the combinations that have no
# articles, and set their count to 0
my_data_complete <- my_data |>
  select(journal, year) |>
  count(journal, year) |>
  right_join(full_years, by = c("journal", "year")) |>
  replace_na(list(n = 0))

# One panel per journal showing the yearly article count
appendix_trend_fig <- ggplot(my_data_complete) +
  geom_line(aes(x = year, y = n)) +
  geom_point(aes(x = year, y = n)) +
  facet_wrap(~journal, ncol = 2) +
  scale_y_continuous(breaks = seq(0, 6, by = 2)) +
  scale_x_continuous(breaks = c(2000, 2005, 2010, 2015, 2020, 2023)) +
  labs(y = "Article Count", x = "Year") +
  theme_bw()

appendix_trend_fig

# Spell out `filename` instead of relying on partial matching of `file`
ggsave(filename = "figures/Appendix_Figure3.png", plot = appendix_trend_fig,
       dpi = 300, width = 10, height = 6)

## Appendix Figure 4 ----
# Topic trends over the years

# Every topic-year combination from 2000 to 2023.
# stringsAsFactors = FALSE keeps `topic` as character: expand.grid() converts
# character vectors to factors by default, which would make the join key a
# factor on one side and character on the other.
full_years_topic <- expand.grid(topic = unique(my_data$topic),
                                year = 2000:2023,
                                stringsAsFactors = FALSE)

# Count articles per topic-year, add the combinations that have no articles,
# and set their count to 0
my_data_topic <- my_data |>
  select(topic, year) |>
  count(topic, year) |>
  right_join(full_years_topic, by = c("topic", "year")) |>
  replace_na(list(n = 0))

# One panel per topic showing the yearly article count
# NOTE(review): limits = c(0, 6) removes any topic-year count above 6 from
# the plot -- confirm that no count exceeds 6.
topic_trend_fig <- ggplot(my_data_topic) +
  geom_line(aes(x = year, y = n)) +
  geom_point(aes(x = year, y = n)) +
  facet_wrap(~topic, ncol = 2) +
  scale_y_continuous(breaks = seq(0, 6, by = 2), limits = c(0, 6)) +
  scale_x_continuous(breaks = c(2000, 2005, 2010, 2015, 2020, 2023)) +
  labs(y = "Article Count", x = "Year") +
  theme_bw()

topic_trend_fig

# Spell out `filename` instead of relying on partial matching of `file`
ggsave(filename = "figures/Appendix_Figure4.png", plot = topic_trend_fig,
       dpi = 300, width = 10, height = 6)

## Appendix Figure 5 ----
# Frequency of each methodological approach listed in `evidence_details`

# Inspect the articles whose only evidence is elite interviews.
# filter() drops rows where `evidence_details` is NA, whereas logical `[`
# indexing would return them as all-NA rows.
my_data |>
  filter(evidence_details == "elite interviews (only)")

# Separate each value in 'evidence_details' by comma (one row per approach)
my_data_separated <- my_data |>
  separate_rows(evidence_details, sep = ",") |>
  mutate(evidence_details = trimws(evidence_details)) # trim any extra spaces

# Count the frequency of each methodological approach
# NOTE(review): the count for "elite interviews" is overwritten with the
# hard-coded value 9 -- confirm where this number comes from and whether it
# should be computed from the data instead.
method_freq <- my_data_separated |>
  group_by(evidence_details) |>
  summarize(frequency = n()) |>
  mutate(frequency = ifelse(evidence_details == "elite interviews",
                            9, frequency)) |>
  arrange(desc(frequency))

# Display the frequency table
print(method_freq)

# Horizontal bar chart, most frequent approach at the top
# NOTE(review): limits = c(0, 35) removes any bar whose frequency exceeds 35
# -- confirm that no frequency is above 35.
method_fig <- method_freq |>
  ggplot(aes(x = reorder(evidence_details, frequency), y = frequency)) +
  geom_col() +
  coord_flip() +
  labs(x = "Methodological Approach",
       y = "Frequency") +
  scale_y_continuous(breaks = seq(0, 40, by = 5), limits = c(0, 35)) +
  theme_minimal()

method_fig

# Spell out `filename` instead of relying on partial matching of `file`
ggsave(filename = "figures/Appendix_Figure5.png", plot = method_fig,
       dpi = 300, width = 10, height = 6)

## Appendix Figure 6 ----
# Anonymity, Appendix, and IRB decisions based on main subfields over the years

# Yearly counts per subfield:
#   count_appendix_interviews - rows with a non-missing `appendix_interviews`
#   count_anonymity           - rows whose `anonym_explanation` is not "No"
#   count_IRB                 - rows with a non-missing `IRB`
# NOTE(review): count_IRB counts every non-missing `IRB` value, whatever the
# value is, yet the line is labelled "IRB Approved" -- confirm that `IRB` is
# only filled in for approved studies.
subfield_appendix <- my_data |>
  select(year, appendix_interviews, IRB, anonym_explanation, subfield1) |>
  mutate(anonym = ifelse(anonym_explanation == "No", 0, 1)) |>
  group_by(year, subfield1) |>
  summarize(
    total_entries = n(),
    count_appendix_interviews = sum(!is.na(appendix_interviews)),
    # na.rm = TRUE: a missing `anonym_explanation` must not turn the whole
    # year-subfield count into NA
    count_anonymity = sum(anonym, na.rm = TRUE),
    count_IRB = sum(!is.na(IRB)),
    .groups = "drop"
  )

# One panel per subfield, one line type per indicator
subfield_fig <- subfield_appendix |>
  ggplot() +
  geom_line(aes(x = year, y = count_appendix_interviews,
                linetype = "Appendix Exists"), linewidth = 1) +
  geom_line(aes(x = year, y = count_IRB, linetype = "IRB Approved"),
            linewidth = 1) +
  geom_line(aes(x = year, y = count_anonymity,
                linetype = "Anonymity Explained"), linewidth = 1) +
  labs(x = "Year", y = "Frequency", linetype = "") +
  scale_x_continuous(breaks = c(2000, 2002, 2005, 2010, 2015, 2020, 2023)) +
  scale_y_continuous(breaks = c(0, 2, 4, 6, 8, 10, 12, 14)) +
  theme_minimal() +
  theme(legend.position = "bottom") +
  # Named values tie each line type to its label explicitly rather than
  # relying on the alphabetical order of the labels
  scale_linetype_manual(values = c("Anonymity Explained" = "solid",
                                   "Appendix Exists" = "dashed",
                                   "IRB Approved" = "dotted")) +
  facet_wrap(~subfield1, ncol = 1)

subfield_fig

# Spell out `filename` instead of relying on partial matching of `file`
ggsave(filename = "figures/Appendix_Figure6.png", plot = subfield_fig,
       dpi = 300, width = 10, height = 6)
